In [1]:
    
import pymongo
import datetime
import numpy as np
import matplotlib.pyplot as plt
    
In [2]:
    
# Connect to the MongoDB server and select the database that holds the tweets.
# NOTE(review): the database is called `allfake`, while later cells label their
# results as "real news" -- confirm that this is the intended database.
client = pymongo.MongoClient("46.101.236.181")
db = client["allfake"]
# Alphabetically sorted names of all collections in the database
# (presumably one collection per news headline -- verify against the data).
collections = sorted(db.collection_names())
    
In [4]:
    
# Build, for every collection, the daily and the cumulative tweet counts.
#   day[name][k]  -> number of tweets in the k-th 24h window after the first tweet
#   diff[name][k] -> running total of day[name][0..k] (cumulative diffusion)
day = {}
diff = {}
delta = datetime.timedelta(days=1)  # width of one counting window

for collection in collections:
    tweets = db[collection]

    # Time frame of the collection: timestamps of its earliest and latest document.
    relevant_from = tweets.find().sort("timestamp", pymongo.ASCENDING).limit(1)[0]['timestamp']
    relevant_till = tweets.find().sort("timestamp", pymongo.DESCENDING).limit(1)[0]['timestamp']

    day[collection] = []
    diff[collection] = []

    # Deliberately no `averagediff = []` here: that name is computed in a later
    # cell, and resetting it on every pass would clobber it when this cell is re-run.
    running_total = 0
    d = relevant_from
    while d <= relevant_till:
        # Tweets whose timestamp lies in the half-open window [d, d + 1 day).
        tweets_today = tweets.find({"timestamp": {"$gte": d, "$lt": d + delta}}).count()
        day[collection].append(tweets_today)

        # The cumulative diffusion is simply the running sum of the daily counts.
        running_total += tweets_today
        diff[collection].append(running_total)

        d += delta
    
In [5]:
    
# Longest diffusion duration (in days) over all collections.
max_days = max(len(counts) for counts in day.values())

# summ_of_diffusions[k]: tweets posted in the k-th day window of a collection,
# summed over all collections. Collections shorter than k days contribute nothing.
summ_of_diffusions = [0] * max_days
for c in collections:
    for d, tweets in enumerate(day[c]):
        summ_of_diffusions[d] += tweets

# Days are numbered from 1 so that the x-axis agrees with the per-collection,
# average and standard-deviation figures of this notebook (they all start at day 1).
plt.step(range(1, len(summ_of_diffusions) + 1), summ_of_diffusions, 'g')
plt.xlabel('Day') 
plt.ylabel('Number of tweets')
plt.title('Diffusion of all real news together')
plt.show()
    
    
In [6]:
    
# Cumulative version of summ_of_diffusions: entry k holds the total number of
# tweets (over all collections) posted in day windows 0..k.
summ_of_diffusions_cumulative = []
running_total = 0
for tweets_on_day in summ_of_diffusions:
    running_total += tweets_on_day
    summ_of_diffusions_cumulative.append(running_total)

# Days are numbered from 1 so that the x-axis agrees with the per-collection
# cumulative figure of this notebook, which also starts at day 1.
plt.step(range(1, len(summ_of_diffusions_cumulative) + 1), summ_of_diffusions_cumulative, 'g')
plt.xlabel('Day') 
plt.ylabel('Cummulative number of tweets')
plt.title('Cummulative diffusion of all real news together')
plt.show()
    
    
In [7]:
    
# One step curve per collection: its cumulative tweet count against the day
# number (1-based) since the collection's first tweet.
for collection in collections:
    cumulative = diff[collection]
    plt.step(range(1, len(cumulative) + 1), cumulative)

plt.title('Cumulative diffusion of real news headlines')
plt.xlabel('Day') 
plt.ylabel('Cummulative number of tweets')
plt.show()
    
    
In [8]:
    
# averagediff[k]: average number of tweets in day window k, taken over ALL
# collections (a collection whose diffusion already ended counts as 0 tweets).
daily_totals = [0] * max_days
for collection in collections:
    for i, tweets in enumerate(day[collection]):
        daily_totals[i] += tweets

# Divide by a float: the counts are ints, and under Python 2 `int / int`
# truncates, which would floor every average (small ones down to 0).
n_collections = float(len(collections))
averagediff = [total / n_collections for total in daily_totals]

plt.xlabel('Day') 
plt.ylabel('Average number of tweets')
plt.step(range(1, len(averagediff) + 1), averagediff, 'g')
plt.title('Average diffusion of real news')
plt.show()
    
    
In [9]:
    
# Average number of tweets per day (averagediff) drawn on a logarithmic y-axis.
day_numbers = range(1, len(averagediff) + 1)  # days are numbered from 1
plt.yscale('log')
plt.step(day_numbers, averagediff, 'g')
plt.xlabel('Day')
plt.ylabel('Average number of tweets')
plt.show()
    
    
In [10]:
    
# number_tweets[k]: tweet counts on day k of every collection that has an entry
# for that day (collections with a shorter diffusion are left out).
number_tweets = [
    [day[c][d] for c in collections if d < len(day[c])]
    for d in range(max_days)
]

# avgdiff_std[k]: standard deviation (np.std with its default arguments) of the
# counts collected for day k.
avgdiff_std = [np.std(counts) for counts in number_tweets]

plt.step(range(1, len(avgdiff_std) + 1), avgdiff_std, 'g')
plt.xlabel('Day')
plt.ylabel('Standart deviation for average number of tweets per day')
plt.title('Standard deviation for real news average')
plt.show()
    
    
In [11]:
    
# For every day: how many of the collections active on that day have a tweet
# count within one standard deviation of the mean, and which share that is.
inside_std = [0] * max_days        # number of values inside mean +/- 1 std, per day
inside_std_share = [0] * max_days  # the same as a fraction of the active collections
for d in range(max_days):
    counts = number_tweets[d]  # counts of the collections that have an entry for day d

    # Centre the band on the mean of the SAME sample the standard deviation was
    # computed from. (averagediff[d] averages over all collections, counting
    # finished ones as 0, so it does not match avgdiff_std[d] on later days.)
    center = np.mean(counts)
    lowest = center - avgdiff_std[d]
    highest = center + avgdiff_std[d]

    inside_std[d] = sum(1 for tweets in counts if lowest <= tweets <= highest)
    inside_std_share[d] = inside_std[d] / float(len(counts))

# NOTE(review): the plotted values are fractions in [0, 1] although the labels
# say "Percent"/"Percentage" -- confirm which one is intended.
plt.ylabel('Percent of values in 1 std from average')
plt.xlabel('Day')
plt.scatter(range(1, len(inside_std_share) + 1), inside_std_share, c='g')
plt.title('Percentage of values inside the range\n of one standard deviation from mean for real news')
plt.show()
    
    
In [15]:
    
# Expose the average curve under a name of its own and persist it with IPython's
# %store magic, so that it can be restored later with `%store -r`.
# Note: this binds a second name to the same list object as `averagediff`; it is not a copy.
averagediff_real = averagediff
%store averagediff_real
    
    
In [16]:
    
# from hard drive, load data for average diffusion of fake news
%store -r averagediff_fake
plt.xlabel('Day') 
plt.ylabel('Average number of tweets')
plt.step(range(1,len(averagediff)+1),averagediff, 'g', label="real news")
plt.step(range(1,len(averagediff_fake)+1),averagediff_fake, 'r', label="fake news")
plt.legend()
plt.title('Average diffusion for both types of news')
plt.show()
    
    
In [17]:
    
# Fake (red) and real (green) average diffusion on a logarithmic y-axis.
plt.yscale('log')
plt.step(range(1, len(averagediff_fake) + 1), averagediff_fake, 'r')
plt.step(range(1, len(averagediff) + 1), averagediff, 'g')
plt.xlabel('Day')
plt.ylabel('Average number of tweets')
plt.show()
    
    
In [19]:
    
# Duration of a diffusion = number of day windows recorded for the collection.
durations = [len(day[col]) for col in collections]
diffDurationAvg = np.mean(durations)     # mean duration in days
diffDurationAvg_std = np.std(durations)  # standard deviation of the durations

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print("Average diffusion duration: %.2f days" % diffDurationAvg)
print("Standard deviation: %.2f days" % diffDurationAvg_std)
    
    
In [ ]: